# --- Geospatial, plotting and general data stack ---
import math
import folium
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon, MultiPolygon
from folium.plugins import PolyLineTextPath
from geopy.distance import geodesic
import branca.colormap as cm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# --- Weather data client (Open-Meteo) with response caching and automatic retries ---
import openmeteo_requests
import requests_cache
from retry_requests import retry
# --- Modelling stack ---
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupKFold, GridSearchCV, cross_val_score, cross_validate, train_test_split
from sklearn.compose import TransformedTargetRegressor
from IPython.display import display, HTML
%load_ext autoreload
%autoreload 2
# Display every bare expression in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Never truncate columns when rendering DataFrames
pd.set_option('display.max_columns', None)
Executive Summary¶
This notebook demonstrates an in-depth analysis of bike-sharing trips in Munich throughout 2023, aiming to uncover patterns and actionable insights from seemingly ordinary bike usage data. The analysis combines classical exploratory techniques — visualising the results in several complementary ways — with a machine learning model that captures interdependencies between variables derived from the dataset and predicts a target variable on previously unseen data. The key conclusions of the analysis align with common sense and practical expectations.
Key Highlights:
- Trip activity is concentrated in the city centre, with most trips starting and ending within several central polygons into which the city area is divided.
- Distributions of trip duration and distance follow expected patterns — longer trips are increasingly rare, in line with the intended use case of short urban journeys.
- Trip activity exhibits various seasonal patterns and can be strongly influenced by external factors such as public transport strikes.
- Under conservative assumptions, it is possible to reliably reconstruct some trip chains made by the same bike — even without explicit bike identifiers.
- Based on the reconstructed trip chains, idle time can be estimated in some cases. Its distribution agrees with common sense, and detailed analysis of the outliers may offer actionable insights.
- Simple features derived from trip and weather data allow for accurate prediction of trip activity, as shown by fairly good model performance on unseen data.
Data Loading and Cleaning¶
The goal of this section is to have a look at the data, ensure correct data format and type (pandas dtype), do plausibility checks and in general make sure that we can rely on the data in the subsequent sections.
# Load the raw trips export. The file uses ';' separators and ',' decimals (German locale).
# low_memory=False reads the whole file in one pass so that columns with mixed content
# (column 7, RENTAL_IS_STATION, per the DtypeWarning) get a single consistently inferred
# dtype instead of chunk-wise inference.
data = pd.read_csv('MVG_Rad_Fahrten_2023.csv', sep=';', decimal=',', low_memory=False)
data
/var/folders/0j/hc4b4c153dzfbvkzcxfbtjv80000gn/T/ipykernel_1324/76297985.py:1: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.
data = pd.read_csv('MVG_Rad_Fahrten_2023.csv', sep=';', decimal=',')
| Row | STARTTIME | ENDTIME | STARTLAT | STARTLON | ENDLAT | ENDLON | RENTAL_IS_STATION | RENTAL_STATION_NAME | RETURN_IS_STATION | RETURN_STATION_NAME | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2023-01-01 00:26 | 2023-01-01 00:51 | 48.13795 | 11.54569 | 48.16123 | 11.55782 | 0 | 1 | Barbarastr | |
| 1 | 2 | 2023-01-01 00:30 | 2023-01-01 00:42 | 48.12903 | 11.54431 | 48.14797 | 11.53445 | 0 | 0 | ||
| 2 | 3 | 2023-01-01 00:32 | 2023-01-01 00:45 | 48.16841 | 11.55566 | 48.16467 | 11.57649 | 0 | 0 | ||
| 3 | 4 | 2023-01-01 00:34 | 2023-01-01 00:46 | 48.16843 | 11.55567 | 48.16464 | 11.57648 | 0 | 0 | ||
| 4 | 5 | 2023-01-01 00:35 | 2023-01-01 00:51 | 48.17104 | 11.54878 | 48.16243 | 11.53007 | 0 | 0 | ||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 710101 | 710102 | 2023-12-31 23:52 | 2023-12-31 23:57 | 48.16719 | 11.55854 | 48.16917 | 11.55547 | 0 | 0 | ||
| 710102 | 710103 | 2023-12-31 23:52 | 2024-01-01 00:07 | 48.17061 | 11.57391 | 48.17334 | 11.55952 | 0 | 0 | ||
| 710103 | 710104 | 2023-12-31 23:53 | 2023-12-31 23:57 | 48.14131 | 11.56144 | 48.14094 | 11.56044 | 0 | 0 | ||
| 710104 | 710105 | 2023-12-31 23:54 | 2023-12-31 23:59 | 48.12353 | 11.54494 | 48.12674 | 11.54758 | 0 | 0 | ||
| 710105 | 710106 | 2023-12-31 23:54 | 2024-01-01 00:06 | 48.12376 | 11.54871 | 48.12376 | 11.54871 | 1 | Kreisverwaltungsreferat | 1 | Kreisverwaltungsreferat |
710106 rows × 11 columns
The dataset contains detailed information about bike trips, with the following key variables:
- Trip timing: Start and end times of each journey
- Location data: Coordinates (latitude/longitude) for both start and end points
- Station information: Names of rental and return stations (when applicable)
# Column names in the raw export carry trailing spaces — strip them.
data.columns = data.columns.str.strip()
# Drop the redundant 'Row' column, then copy the positional index into a regular
# column 'trip_index' so each trip keeps a stable identifier through later filtering.
data.drop(columns=['Row'], inplace=True)
data.reset_index(names='trip_index', inplace=True)
data
| trip_index | STARTTIME | ENDTIME | STARTLAT | STARTLON | ENDLAT | ENDLON | RENTAL_IS_STATION | RENTAL_STATION_NAME | RETURN_IS_STATION | RETURN_STATION_NAME | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2023-01-01 00:26 | 2023-01-01 00:51 | 48.13795 | 11.54569 | 48.16123 | 11.55782 | 0 | 1 | Barbarastr | |
| 1 | 1 | 2023-01-01 00:30 | 2023-01-01 00:42 | 48.12903 | 11.54431 | 48.14797 | 11.53445 | 0 | 0 | ||
| 2 | 2 | 2023-01-01 00:32 | 2023-01-01 00:45 | 48.16841 | 11.55566 | 48.16467 | 11.57649 | 0 | 0 | ||
| 3 | 3 | 2023-01-01 00:34 | 2023-01-01 00:46 | 48.16843 | 11.55567 | 48.16464 | 11.57648 | 0 | 0 | ||
| 4 | 4 | 2023-01-01 00:35 | 2023-01-01 00:51 | 48.17104 | 11.54878 | 48.16243 | 11.53007 | 0 | 0 | ||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 710101 | 710101 | 2023-12-31 23:52 | 2023-12-31 23:57 | 48.16719 | 11.55854 | 48.16917 | 11.55547 | 0 | 0 | ||
| 710102 | 710102 | 2023-12-31 23:52 | 2024-01-01 00:07 | 48.17061 | 11.57391 | 48.17334 | 11.55952 | 0 | 0 | ||
| 710103 | 710103 | 2023-12-31 23:53 | 2023-12-31 23:57 | 48.14131 | 11.56144 | 48.14094 | 11.56044 | 0 | 0 | ||
| 710104 | 710104 | 2023-12-31 23:54 | 2023-12-31 23:59 | 48.12353 | 11.54494 | 48.12674 | 11.54758 | 0 | 0 | ||
| 710105 | 710105 | 2023-12-31 23:54 | 2024-01-01 00:06 | 48.12376 | 11.54871 | 48.12376 | 11.54871 | 1 | Kreisverwaltungsreferat | 1 | Kreisverwaltungsreferat |
710106 rows × 11 columns
# Parse the timestamp strings into datetime64[ns].
# All remaining columns already carry the correct dtype.
for timestamp_col in ('STARTTIME', 'ENDTIME'):
    data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 710106 entries, 0 to 710105 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 trip_index 710106 non-null int64 1 STARTTIME 710106 non-null datetime64[ns] 2 ENDTIME 710106 non-null datetime64[ns] 3 STARTLAT 710106 non-null float64 4 STARTLON 710106 non-null float64 5 ENDLAT 710106 non-null float64 6 ENDLON 710106 non-null float64 7 RENTAL_IS_STATION 710106 non-null object 8 RENTAL_STATION_NAME 710106 non-null object 9 RETURN_IS_STATION 710106 non-null object 10 RETURN_STATION_NAME 710106 non-null object dtypes: datetime64[ns](2), float64(4), int64(1), object(4) memory usage: 59.6+ MB
# Cell values in these object columns also carry trailing spaces — strip them.
for text_col in ('RENTAL_IS_STATION', 'RENTAL_STATION_NAME', 'RETURN_IS_STATION', 'RETURN_STATION_NAME'):
    data[text_col] = data[text_col].str.strip()
data.query('RENTAL_IS_STATION.isna()')['RENTAL_STATION_NAME'].value_counts(dropna=False)
data.query('RETURN_IS_STATION.isna()')['RETURN_STATION_NAME'].value_counts(dropna=False)
# The *_IS_STATION flags are unreliable: RENTAL_IS_STATION can be NA even when the trip
# starts at a station, so discard both flag columns. Whether a trip started or ended at a
# station can be derived from RENTAL_STATION_NAME / RETURN_STATION_NAME instead.
data.drop(columns=['RETURN_IS_STATION', 'RENTAL_IS_STATION'], inplace=True)
RENTAL_STATION_NAME
51366
Sandstraße 327
TUM Arcisstraße 250
Hauptbahnhof Nord 231
Olympiazentrum 216
...
Waldsiedlung Faistenhaar 1
Ferd.-Kobell-Straße Haar 1
Ottobrunner Straße Faistenhaar 1
S-Bahnhof Wächterhof 1
Mallertshoffener Unterschleißheim 1
Name: count, Length: 315, dtype: int64
Series([], Name: count, dtype: int64)
# Sanity check: each named station should have identical coordinates across the whole dataset.
# Stack rental-side and return-side (station, lat, lon) triples into one long frame,
# then look at the per-station standard deviation of the coordinates.
all_stations_with_coord = pd.concat([data[['RENTAL_STATION_NAME', 'STARTLAT', 'STARTLON']].rename(columns={'RENTAL_STATION_NAME': 'station',
                                                                                                           'STARTLAT': 'lat',
                                                                                                           'STARTLON': 'lon'}),
                                     data[['RETURN_STATION_NAME', 'ENDLAT', 'ENDLON']].rename(columns={'RETURN_STATION_NAME': 'station',
                                                                                                       'ENDLAT': 'lat',
                                                                                                       'ENDLON': 'lon'})], axis=0)
std_deviations = all_stations_with_coord.groupby('station').std()
std_deviations.head()
std_deviations.query('lat != 0 | lon != 0')
# Indeed, only for the points that are not stations (empty station name) is the standard
# deviation of the coordinates different from 0. For every real station it is exactly zero,
# so station coordinates are consistent throughout the dataset.
| lat | lon | |
|---|---|---|
| station | ||
| 5.698372 | 3.448503 | |
| AGROB Nord Ismaning | 0.000000 | 0.000000 |
| AGROB Süd Ismaning | 0.000000 | 0.000000 |
| Ackermannstraße | 0.000000 | 0.000000 |
| Ahornring Taufkirchen | 0.000000 | 0.000000 |
| lat | lon | |
|---|---|---|
| station | ||
| 5.698372 | 3.448503 |
data.info()
data.describe()
# No NA values remain, but some coordinates in all four coordinate columns are clearly
# implausible: some of them are equal to 0, and some are even negative.
<class 'pandas.core.frame.DataFrame'> RangeIndex: 710106 entries, 0 to 710105 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 trip_index 710106 non-null int64 1 STARTTIME 710106 non-null datetime64[ns] 2 ENDTIME 710106 non-null datetime64[ns] 3 STARTLAT 710106 non-null float64 4 STARTLON 710106 non-null float64 5 ENDLAT 710106 non-null float64 6 ENDLON 710106 non-null float64 7 RENTAL_STATION_NAME 710106 non-null object 8 RETURN_STATION_NAME 710106 non-null object dtypes: datetime64[ns](2), float64(4), int64(1), object(2) memory usage: 48.8+ MB
| trip_index | STARTTIME | ENDTIME | STARTLAT | STARTLON | ENDLAT | ENDLON | |
|---|---|---|---|---|---|---|---|
| count | 710106.000000 | 710106 | 710106 | 710106.000000 | 710106.000000 | 710106.000000 | 710106.000000 |
| mean | 355052.500000 | 2023-07-03 20:01:02.147932928 | 2023-07-03 20:28:35.743255296 | 47.610000 | 11.772959 | 47.589505 | 11.720168 |
| min | 0.000000 | 2023-01-01 00:26:00 | 2023-01-01 00:42:00 | 0.000000 | -71.178000 | -55.973800 | -99.259350 |
| 25% | 177526.250000 | 2023-05-06 16:43:00 | 2023-05-06 17:17:15 | 48.128250 | 11.549040 | 48.128300 | 11.549040 |
| 50% | 355052.500000 | 2023-07-04 19:35:00 | 2023-07-04 19:54:00 | 48.143190 | 11.567820 | 48.143190 | 11.567710 |
| 75% | 532578.750000 | 2023-09-07 20:35:00 | 2023-09-07 21:04:00 | 48.159040 | 11.584500 | 48.159020 | 11.584430 |
| max | 710105.000000 | 2023-12-31 23:54:00 | 2024-01-01 16:00:00 | 53.094660 | 141.353220 | 53.094660 | 141.353220 |
| std | 204990.089464 | NaN | NaN | 5.045196 | 3.141217 | 5.170640 | 3.034583 |
# Between the 1.5% and 99% quantiles the coordinates are roughly plausible,
# at least up to their integer parts — the implausible values are rare outliers.
data[['STARTLAT', 'STARTLON', 'ENDLAT', 'ENDLON']].quantile(0.015)
data[['STARTLAT', 'STARTLON', 'ENDLAT', 'ENDLON']].quantile(0.99)
STARTLAT 48.03886 STARTLON 11.46081 ENDLAT 48.03864 ENDLON 11.45315 Name: 0.015, dtype: float64
STARTLAT 48.24959 STARTLON 11.73105 ENDLAT 48.24959 ENDLON 11.72308 Name: 0.99, dtype: float64
After visual inspection of the data points on a map, the geographic boundaries of the area to be analysed were set as follows:
- Latitude range: 47.9°N to 48.32°N
- Longitude range: 11.2°E to 12.0°E
This rectangular area encompasses the entire city of Munich and its immediate neighbouring towns.
# Keep only trips whose start AND end coordinates fall inside the Munich bounding box.
ranges = {'STARTLAT': (47.9, 48.32), 'ENDLAT': (47.9, 48.32), 'STARTLON': (11.2, 12), 'ENDLON': (11.2, 12)}
conditions = [f'{col} >= {low} & {col} <= {high}' for col, (low, high) in ranges.items()]
query_str = ' & '.join(conditions)
data_filtered = data.query(query_str).copy()
data_filtered
# Verify the resulting coordinate extremes lie inside the chosen box
data_filtered[['STARTLAT', 'STARTLON', 'ENDLAT', 'ENDLON']].apply([min, max])
| trip_index | STARTTIME | ENDTIME | STARTLAT | STARTLON | ENDLAT | ENDLON | RENTAL_STATION_NAME | RETURN_STATION_NAME | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2023-01-01 00:26:00 | 2023-01-01 00:51:00 | 48.13795 | 11.54569 | 48.16123 | 11.55782 | Barbarastr | |
| 1 | 1 | 2023-01-01 00:30:00 | 2023-01-01 00:42:00 | 48.12903 | 11.54431 | 48.14797 | 11.53445 | ||
| 2 | 2 | 2023-01-01 00:32:00 | 2023-01-01 00:45:00 | 48.16841 | 11.55566 | 48.16467 | 11.57649 | ||
| 3 | 3 | 2023-01-01 00:34:00 | 2023-01-01 00:46:00 | 48.16843 | 11.55567 | 48.16464 | 11.57648 | ||
| 4 | 4 | 2023-01-01 00:35:00 | 2023-01-01 00:51:00 | 48.17104 | 11.54878 | 48.16243 | 11.53007 | ||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 710101 | 710101 | 2023-12-31 23:52:00 | 2023-12-31 23:57:00 | 48.16719 | 11.55854 | 48.16917 | 11.55547 | ||
| 710102 | 710102 | 2023-12-31 23:52:00 | 2024-01-01 00:07:00 | 48.17061 | 11.57391 | 48.17334 | 11.55952 | ||
| 710103 | 710103 | 2023-12-31 23:53:00 | 2023-12-31 23:57:00 | 48.14131 | 11.56144 | 48.14094 | 11.56044 | ||
| 710104 | 710104 | 2023-12-31 23:54:00 | 2023-12-31 23:59:00 | 48.12353 | 11.54494 | 48.12674 | 11.54758 | ||
| 710105 | 710105 | 2023-12-31 23:54:00 | 2024-01-01 00:06:00 | 48.12376 | 11.54871 | 48.12376 | 11.54871 | Kreisverwaltungsreferat | Kreisverwaltungsreferat |
694657 rows × 9 columns
| STARTLAT | STARTLON | ENDLAT | ENDLON | |
|---|---|---|---|---|
| min | 47.91573 | 11.20252 | 47.91573 | 11.20252 |
| max | 48.31310 | 11.95886 | 48.31479 | 11.95886 |
Previously we checked if all stations have consistent coordinates. Let us also ensure the opposite: whether the station names are available for all known station locations. In some cases they may be missing even though the coordinates correspond to a known station.
# Build a coordinate lookup for all stations; n_trips counts the trips that *started* at each.
# "first" is a safe aggregator because station coordinates were shown above to be consistent.
# NOTE(review): only RENTAL_STATION_NAME is scanned here — a station that appears exclusively
# as a return location would be missed; confirm that cannot happen or is acceptable.
station_coordinates = data_filtered.query('RENTAL_STATION_NAME != ""')[['RENTAL_STATION_NAME', 'STARTLAT', 'STARTLON']].groupby('RENTAL_STATION_NAME')\
    .agg(latitude=pd.NamedAgg(column="STARTLAT", aggfunc="first"),
         longitude=pd.NamedAgg(column="STARTLON", aggfunc="first"),
         n_trips=pd.NamedAgg(column="STARTLAT", aggfunc="count"))
station_coordinates = station_coordinates.reset_index().rename(columns={'RENTAL_STATION_NAME': 'STATION_NAME'})
station_coordinates
| STATION_NAME | latitude | longitude | n_trips | |
|---|---|---|---|---|
| 0 | AGROB Nord Ismaning | 48.21102 | 11.66092 | 73 |
| 1 | AGROB Süd Ismaning | 48.20836 | 11.65885 | 151 |
| 2 | Ackermannstraße | 48.16824 | 11.56469 | 828 |
| 3 | Ahornring Taufkirchen | 48.04797 | 11.59898 | 74 |
| 4 | Ainmillerstraße | 48.15904 | 11.57756 | 707 |
| ... | ... | ... | ... | ... |
| 326 | Würmtalstraße Gräfelfing | 48.11545 | 11.44003 | 163 |
| 327 | ZHS Olympiazentrum | 48.18079 | 11.54656 | 663 |
| 328 | Zenettiplatz | 48.12452 | 11.55557 | 1176 |
| 329 | Zugspitzstraße Neuried | 48.08800 | 11.46974 | 110 |
| 330 | astopark | 48.09174 | 11.28297 | 339 |
331 rows × 4 columns
# Do two joins on coordinates and copy the station name to RENTAL_STATION_NAME and RETURN_STATION_NAME
# Now we will have some NAs in new columns RENTAL_STATION_NAME and RETURN_STATION_NAME (previously they were empty strings).
data_filtered = data_filtered.merge(
station_coordinates[['STATION_NAME', 'latitude', 'longitude']],
left_on=['STARTLAT', 'STARTLON'], right_on=['latitude', 'longitude'],
how='left'
)
data_filtered['RENTAL_STATION_NAME'] = data_filtered['STATION_NAME']
data_filtered.drop(columns=['STATION_NAME', 'latitude', 'longitude'], inplace=True)
data_filtered = data_filtered.merge(
station_coordinates[['STATION_NAME', 'latitude', 'longitude']],
left_on=['ENDLAT', 'ENDLON'], right_on=['latitude', 'longitude'],
how='left'
)
data_filtered['RETURN_STATION_NAME'] = data_filtered['STATION_NAME']
data_filtered.drop(columns=['STATION_NAME', 'latitude', 'longitude'], inplace=True)
data_filtered
| trip_index | STARTTIME | ENDTIME | STARTLAT | STARTLON | ENDLAT | ENDLON | RENTAL_STATION_NAME | RETURN_STATION_NAME | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2023-01-01 00:26:00 | 2023-01-01 00:51:00 | 48.13795 | 11.54569 | 48.16123 | 11.55782 | NaN | Barbarastr |
| 1 | 1 | 2023-01-01 00:30:00 | 2023-01-01 00:42:00 | 48.12903 | 11.54431 | 48.14797 | 11.53445 | NaN | NaN |
| 2 | 2 | 2023-01-01 00:32:00 | 2023-01-01 00:45:00 | 48.16841 | 11.55566 | 48.16467 | 11.57649 | NaN | NaN |
| 3 | 3 | 2023-01-01 00:34:00 | 2023-01-01 00:46:00 | 48.16843 | 11.55567 | 48.16464 | 11.57648 | NaN | NaN |
| 4 | 4 | 2023-01-01 00:35:00 | 2023-01-01 00:51:00 | 48.17104 | 11.54878 | 48.16243 | 11.53007 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 694652 | 710101 | 2023-12-31 23:52:00 | 2023-12-31 23:57:00 | 48.16719 | 11.55854 | 48.16917 | 11.55547 | NaN | NaN |
| 694653 | 710102 | 2023-12-31 23:52:00 | 2024-01-01 00:07:00 | 48.17061 | 11.57391 | 48.17334 | 11.55952 | NaN | NaN |
| 694654 | 710103 | 2023-12-31 23:53:00 | 2023-12-31 23:57:00 | 48.14131 | 11.56144 | 48.14094 | 11.56044 | NaN | NaN |
| 694655 | 710104 | 2023-12-31 23:54:00 | 2023-12-31 23:59:00 | 48.12353 | 11.54494 | 48.12674 | 11.54758 | NaN | NaN |
| 694656 | 710105 | 2023-12-31 23:54:00 | 2024-01-01 00:06:00 | 48.12376 | 11.54871 | 48.12376 | 11.54871 | Kreisverwaltungsreferat | Kreisverwaltungsreferat |
694657 rows × 9 columns
# Resulting value counts for RENTAL_STATION_NAME and RETURN_STATION_NAME:
# as discussed later, most trips start and end outside stations (NaN dominates).
data_filtered['RENTAL_STATION_NAME'].value_counts(dropna=False)
data_filtered['RETURN_STATION_NAME'].value_counts(dropna=False)
RENTAL_STATION_NAME
NaN 538323
Sandstraße 3476
TUM Arcisstraße 2623
Hauptbahnhof Nord 2601
Münchner Freiheit 2402
...
Tölzer Straße Otterloh 17
Domagkstraße Kirchheim 12
Am Sportpark Feldkirchen 11
Waldsiedlung Faistenhaar 9
Bogenstraße Waldbrunn 4
Name: count, Length: 332, dtype: int64
RETURN_STATION_NAME
NaN 571970
Sandstraße 2966
Hauptbahnhof Nord 2456
TUM Arcisstraße 2134
Olympiazentrum 2091
...
Am Sportpark Feldkirchen 13
Ferd.-Kobell-Straße Haar 12
Parkplatz Grünwald 11
Waldsiedlung Faistenhaar 7
Bogenstraße Waldbrunn 3
Name: count, Length: 332, dtype: int64
Visualization: Trip Start and End Locations¶
This visualization displays a sample of locations where bike trips began and ended in 2023. Key features of the plot:
- Marker size represents the frequency of trips at each location
- Data is aggregated by location before sampling
- Larger markers typically indicate station locations, where there is natural concentration of trip starts and ends
- Note: Due to the sampling process, some high-traffic stations might not appear in this visualization
Use the zoom controls to explore specific areas in detail.
def plot_location_counts(data, latitude_column, longitude_column, marker_size, title, **kwargs):
    """Render an interactive Folium map with one circle per unique coordinate pair.

    Marker radius is proportional to the number of trips at that exact location,
    scaled so the busiest location gets radius `marker_size`. Remaining keyword
    arguments are forwarded to DataFrame.sample() to thin out the markers before
    plotting. The map (preceded by `title` as HTML) is shown via IPython display.
    """
    # Count occurrences of each unique (lat, lon) pair
    location_counts = (data[[latitude_column, longitude_column]]
                       .groupby([latitude_column, longitude_column])
                       .size()
                       .reset_index(name='count'))
    # Scale marker radii relative to the busiest location
    location_counts['size'] = location_counts['count'] / location_counts['count'].max() * marker_size
    # Centre the map on the mean coordinate of the input data
    fmap = folium.Map(location=[data[latitude_column].mean(), data[longitude_column].mean()], zoom_start=12)
    # Draw a circle for every sampled location
    for _, row in location_counts.sample(**kwargs).iterrows():
        folium.CircleMarker(
            location=[row[latitude_column], row[longitude_column]],
            radius=row['size'],
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6,
            popup=f"Count: {row['count']}"
        ).add_to(fmap)
    display(HTML(title))
    display(fmap)
%%time
# Sample 20k distinct locations per map to keep the rendered HTML responsive;
# as noted above, sampling may omit some high-traffic stations.
sample_size = 20000
plot_location_counts(data_filtered, 'STARTLAT', 'STARTLON', 50, "<h3>Trip Start Locations (marker size reflects the number of trip starts)</h3>", n=sample_size)
plot_location_counts(data_filtered, 'ENDLAT', 'ENDLON', 50, "<h3>Trip End Locations (marker size reflects the number of trip ends)</h3>", n=sample_size)